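'''
  1. Fetch the page from the target website.
  2. Extract the data from the page.

  res.text    -- the decoded response body, a Unicode string (str)
  res.content -- the raw response body, of type bytes
'''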
import requests
from lxml import etree
import pandas as pd
from pandas import DataFrame

# Listing page that the script downloads: the "nowplaying" cinema page under
# the `langfang` path on movie.douban.com.
url = 'https://movie.douban.com/cinema/nowplaying/langfang/'

# HTTP headers sent with the request: a desktop Chrome User-Agent string and a
# Referer pointing at the site's own home page.
# NOTE(review): presumably these make the request look like an ordinary
# browser visit -- confirm whether the site actually requires them.
headers = dict([
  ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"),
  ("Referer", "https://movie.douban.com/"),
])


# Download the listing page. requests applies no timeout unless one is given,
# so without it an unresponsive server would block the script indefinitely.
response = requests.get(url, headers=headers, timeout=10)
# Fail right here on a 4xx/5xx answer instead of parsing an error page as if
# it were the movie listing.
response.raise_for_status()

# response.text is the body decoded to str (response.content would be the
# undecoded bytes).
text = response.text

# Parse the markup into an lxml element tree that can be queried with XPath.
html = etree.HTML(text)

# Locate the first <ul class="lists"> in the document; its direct <li>
# children are the movie entries read below.
_movie_lists = html.xpath("//ul[@class='lists']")
if not _movie_lists:
  # Same exception type the bare `[0]` lookup used to raise, but with a
  # message that says what was actually missing.
  raise IndexError("no <ul class='lists'> element found in the downloaded page")
ul = _movie_lists[0]
lis = ul.xpath("./li")

# Not filled in by this script; kept so the module-level name stays defined.
movies = []

# Column-oriented result: one list per column, one entry per <li>.
movies_df = {
  'title': [],
  'score': [],
  'actors': [],
  'director': [],
  'thum': []
}
for li in lis:
  # The movie details are stored as data-* attributes on the <li> itself.
  # A missing attribute yields '' so one incomplete entry does not abort the
  # whole run with an IndexError.
  title = li.get('data-title', '')
  score = li.get('data-score', '')
  actors = li.get('data-actors', '')
  director = li.get('data-director', '')
  # The thumbnail is the src of the first <img> anywhere below the <li>;
  # fall back to '' when the entry has no image.
  _img_sources = li.xpath('.//img/@src')
  thum = _img_sources[0] if _img_sources else ''
  movies_df['title'].append(title)
  movies_df['score'].append(score)
  movies_df['actors'].append(actors)
  movies_df['director'].append(director)
  movies_df['thum'].append(thum)
print(movies_df)
  
# Turn the column-oriented dict into a DataFrame (one column per key) and
# save it as an Excel workbook at ./1.xlsx, relative to the current working
# directory.
pf = DataFrame(data=movies_df)
pf.to_excel(excel_writer='./1.xlsx')



